#############################################################
## Replication file for Ban, Palmer and Schneer 2019       ##
## file: create_fe.R                                       ##
## date: 5/16/2019                                         ##
#############################################################

#Install relevant R packages
#install.packages(c('foreign','data.table','reshape2','glmnet','doMC'))

rm(list=ls())

#Load packages
require(foreign)
require(data.table)
require(reshape2)
require(glmnet)
require(doMC)
registerDoMC(cores=16)

#Set working directory
setwd("/nfs/home/B/bschneer/shared_space/bschneer/lobbying_replication")

#Set parameters for whether to estimate w/ lobbying firm FEs for all firms and/or w/ covariate tracking number of employees on lobbying report
#Set to T or F

include.firm.fe <- F
include.firm.N <- F

#Read in data from Center for Responsive Politics
f1<-read.dta("inputs/lobbyists.dta")
setDT(f1)
f2<-read.dta("inputs/lobbying_agencies.dta")
setDT(f2)

#######################################################################################
#Determine time period we want to estimate FE for -- comment out other time periods ###
#######################################################################################

#all
time.periods <- data.table(year=2000:2014,period=1)

#hilo
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,2,1,1,1,1,2,2,2,2,2,2,1))

#50
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,2,3,3,3,3,4,4,4,4,4,4,5))

#75
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,2,3,3,3,3,4,4,4,4,4,5,5))

#yearly
#time.periods <- data.table(year=2000:2014,period=2000:2014)

#Merge time period to lobbying data
f1 <- time.periods[f1,on="year",nomatch=0]


#Gemerate summary stats on number of lobbying reports and unique lobbyists

f1[,length(unique(trans_id))]
f1[,length(unique(lobbyist_id))]

##ID lobbyists who don't work very much, so we can fold them into a firm FE

lobbyist.N.yr<-f1[,.N,by=list(lobbyist_id,period)]

#Combine lobbyists to firm FE's for people who never work on more than 12 lobbying reports; include all former MCs

former.mc<-fread("inputs/former_mc.csv")

drop.lobbyists.yr<-lobbyist.N.yr[N<=12 & !(lobbyist_id %in% former.mc$lobbyist_id) ,.(lobbyist_id,period,drop=T)]

f1[,registrant_id:=paste("r",.GRP,sep="_"),keyby=registrant]

f1.drop.lobbyists.yr <- drop.lobbyists.yr[f1,on=c('lobbyist_id','period')]
f1.drop.lobbyists.yr[is.na(drop),drop:=F]

counts.yr <- copy(f1.drop.lobbyists.yr)

counts.yr[,lobbyist_id2:=lobbyist_id]
counts.yr[,registrant_id2:=registrant_id]

counts.yr[drop==T,lobbyist_id2:=""]

counts.yr<-counts.yr[,.(.N,lobbyists=paste(sort(lobbyist_id2),collapse="|"),registrants=paste(sort(unique(registrant_id2)),collapse="")),by=.(trans_id,client,period)]


while(counts.yr[,length(grep("\\|\\|",lobbyists))>0]){
counts.yr[grep("\\|\\|",lobbyists),lobbyists:=gsub("\\|\\|","|",lobbyists)]
counts.yr[grep("\\|\\|",registrants),registrants:=gsub("\\|\\|","|",registrants)]

}

counts.yr[nchar(lobbyists)<=1,lobbyists:=""]
counts.yr[nchar(registrants)<=1,registrants:=""]

counts.yr[substr(lobbyists,1,1)=="|",lobbyists:=substr(lobbyists,2,nchar(lobbyists))]
counts.yr <- unique(counts.yr[,.(N,lobbyists,client,registrants,period)])

#Prepare data set for estimating lobbyist FEs

#Create Lobbyist Indicator variables -- must do year by year due to memory limits

f1.dummies.yr<-list()
for (yr in 2000:2014){

lobbyists.yr<-dcast.data.table(f1.drop.lobbyists.yr[year==yr & drop==F], trans_id ~ lobbyist_id , fun=length)

if(include.firm.fe==FALSE){

registrants.yr<-dcast.data.table(f1.drop.lobbyists.yr[year==yr & drop==T], trans_id ~ registrant_id , fun=function(x) as.numeric(length(x)>0))

}else{

registrants.yr<-dcast.data.table(f1.drop.lobbyists.yr[year==yr], trans_id ~ registrant_id , fun=function(x) as.numeric(length(x)>0))

}

out.yr<-merge(lobbyists.yr,registrants.yr,by="trans_id",all=T)

f1.dummies.yr[[paste(yr)]]<-out.yr

f1.dummies.yr[[paste(yr)]]<-f1.dummies.yr[[paste(yr)]][,lapply(.SD,sum),by=list(trans_id),.SDcols=names(f1.dummies.yr[[paste(yr)]])[-1]]
}

#Remove objects to save memory

rm(list=c("lobbyists.yr","registrants.yr","out.yr","f1.drop.lobbyists.yr"))

#Assemble dummy variables calculated for each year

f1b.dummies.yr<-rbindlist(f1.dummies.yr,fill=T)

#Set NAs to zeroes
replace.nas = function(DT) {
    for (j in seq_len(ncol(DT)))
        set(DT,which(is.na(DT[[j]])),j,0)
}

replace.nas(f1b.dummies.yr)

#Create covariates
f2b<-unique(f2[,list(trans_id,agency_id)])

f2b[,cong_lobby:=as.numeric(agency_id %in% c(1,2))]
f2b[,agency_lobby:=as.numeric(agency_id > 2)]

f2b[,agency_id:=NULL]

f2c<-f2b[,.(cong_lobby=max(cong_lobby,na.rm=T),agency_lobby=max(agency_lobby,na.rm=T)),keyby=trans_id]

#assemble covariates and lobbyist dummies
f3<-merge(unique(f1[,list(year,trans_id,amount,client)]),f2c,all.x=T,by="trans_id")
rm(list=ls()[!(ls()%in%c("f3","f1b.dummies","drop.lobbyists","f1b.dummies.yr","drop.lobbyists.yr","time.periods","counts","counts.yr","include.firm.fe","include.firm.N"))])

f4.yr<-f1b.dummies.yr[f3,on='trans_id']

rm(list=ls()[!(ls()%in%c("f4","drop.lobbyists","f4.yr","drop.lobbyists.yr","time.periods","counts","counts.yr","include.firm.fe","include.firm.N"))])

f5.yr<-f4.yr[,list(amount=sum(amount,na.rm=T),cong_lobby=max(cong_lobby,na.rm=T),agency_lobby=max(agency_lobby,na.rm=T)),keyby=c(names(f4.yr)[!(names(f4.yr) %in% c("amount","trans_id","cong_lobby","agency_lobby"))])]

#Adjust earnings to 2014 Dollars
adjustment<-data.table(year=2000:2014,adj=c(1.37,1.34,1.32,1.29,1.25,1.21,1.17,1.14,1.1,1.1,1.09,1.05,1.03,1.02,1))

adjust.yr<-adjustment$adj[match(f5.yr$year,adjustment$year)]

f5.yr[,amount_adjusted:=amount*adjust.yr]

rm(list=ls()[!(ls()%in%c("f5","drop.lobbyists","f5.yr","drop.lobbyists.yr","time.periods","counts","counts.yr","include.firm.fe","include.firm.N"))])

#Set NAs
for (j in grep("lobby",names(f5.yr))){
 set(f5.yr,which(f5.yr[[j]]==-Inf),j,NA)
}

#Remove anyone we do not have earnings data for

f5.yr<-f5.yr[!(is.na(amount_adjusted))]

#Identify covariates by group -- Lobbyist FE, outcome, and year dummies

X.vars.yr<-grep("Y0|YX0|r_",names(f5.yr),value=T)
y.vars.yr<-"amount_adjusted"
Z.vars.yr<-grep("year",names(f5.yr),value=T)

#Recover list of registrants and lobbyists for each row
f5.yr[,lobbyists:=""]; f5.yr[,registrants:=""]

z <- 1
for (j in grep("Y0|YX0",names(f5.yr),value=T)){
if(z%%100==0) print(z)
f5.yr[get(j)==1,lobbyists:=paste(lobbyists,j,sep="|")]
z<-z+1
}

z <- 1
for (j in grep("r_",names(f5.yr),value=T)){
if(z%%100==0) print(z)
f5.yr[get(j)==1,registrants:=paste(registrants,j,sep="|")]
z<-z+1
}

#Count number of lobbyists, registrants
f5.yr[nchar(lobbyists)-nchar(gsub("\\|","",lobbyists))==1,lobbyists:=gsub("\\|","",lobbyists)]
f5.yr[nchar(registrants)-nchar(gsub("\\|","",registrants))==1,registrants:=gsub("\\|","",registrants)]
f5.yr[substr(lobbyists,1,1)=="|",lobbyists:=substr(lobbyists,2,nchar(lobbyists))]

f5.yr[,`:=`(lobbyists=trimws(lobbyists),registrants=trimws(registrants))]


#Create year variable and merge on time period codes

f5.yr[,year:=as.integer(as.character(year))]

f5.yr <- time.periods[f5.yr,on="year"]

#Reorder list of lobbyists in each row alphabetically
f5.yr[,lobbyists:=sapply(strsplit(lobbyists,"\\|"),function(x) paste(sort(x),collapse="|"))]

#merge on counts of lobbyists and registrants
f5.yr <- counts.yr[f5.yr,on=c('lobbyists','registrants','client','period')]

#Estimate lobbyist FEs for each period

#create empty lists to hold results
out<-list()
out.lim<-list()

#loop over time periods
for (time.period in unique(time.periods$period)){
print(time.period)

#get list of X variables
X.vars<-grep("Y0|YX0|r_",names(f5.yr[period==time.period]),value=T)

drop.zeros <- f5.yr[period==time.period,X.vars,with=F][,sapply(.SD,function(x) any(x>0))]
X.vars <- X.vars[drop.zeros]

#Z variables
Z.vars<-grep("year",names(f5.yr[period==time.period]),value=T)

if(include.firm.N==T){
Z.vars <- c(Z.vars,"N")

}

#Create Model matrix
X<-sparse.model.matrix(~-1+.,f5.yr[period==time.period,c(X.vars,Z.vars),with=F],drop.unused.levels=T)
y<-as.numeric(unlist(f5.yr[period==time.period,y.vars.yr,with=F]))

penalty<-rep(1,ncol(X))
penalty[colnames(X) %in% Z.vars | grepl("year",colnames(X))]<-0

lowlimit<-rep(0,ncol(X))
lowlimit[(colnames(X) %in% Z.vars) | grepl("year",colnames(X))]<-(-Inf)

foldid<-sample(1:10,size=length(y),replace=TRUE)

if(include.firm.N==T){
keep.cols<-c(grep("Y|r_",colnames(X)),which(Z.vars==colnames(X)))
}else{

keep.cols<-c(grep("Y|r_",colnames(X)))

}

#Implement Ridge Regression
set.seed(02139)
cv.ridge <- tryCatch({cv.glmnet(X[,keep.cols],y,alpha=0,foldid=foldid,standardize=F,intercept=F,family="gaussian",parallel=TRUE,penalty.factor=penalty,lambda=sort(seq(0,500,1)/100,decreasing=T))}, error = function(e) simpleError("error"))

#In event of error in estimation, attempt again

e<-1
while(length(cv.ridge)==2){
print(paste("error",e))
if(cv.ridge$message=="error") cv.ridge <- tryCatch({cv.glmnet(X[,keep.cols],y,alpha=0,foldid=foldid,standardize=F,intercept=F,family="gaussian",parallel=TRUE,penalty.factor=penalty,lambda=sort(seq(0,500,1)/100,decreasing=T))}, error = function(e) simpleError("error"))
e<-e+1
if(e>10) break
}

#Implement ridge regression limiting FE estimates to minimum of zero
cv.ridge.lim <- tryCatch({cv.glmnet(X[,keep.cols],y,alpha=0,foldid=foldid,standardize=F,intercept=F,family="gaussian",parallel=TRUE,penalty.factor=penalty,lambda=sort(seq(0,500,1),decreasing=T)/100,lower.limits=lowlimit[keep.cols])}, error = function(e) simpleError("error"))

e2<-1
while(length(cv.ridge.lim)==2){
print(paste("error",e2))
if(cv.ridge.lim$message=="error") cv.ridge.lim <- tryCatch({cv.glmnet(X[,keep.cols],y,alpha=0,foldid=foldid,standardize=F,intercept=F,family="gaussian",parallel=TRUE,penalty.factor=penalty,lambda=sort(seq(0,500,1),decreasing=T)/100,lower.limits=lowlimit[keep.cols])}, error = function(e) simpleError("error"))
e2<-e2+1
if(e2>10) break
}

best_lambda <- cv.ridge$lambda.min
best_lambda.lim <- cv.ridge.lim$lambda.min

print(best_lambda)
print(best_lambda.lim)

fit.cv.ridge.coef<-coef(cv.ridge,s=best_lambda)
fit.cv.ridge.coef.lim<-coef(cv.ridge.lim,s=best_lambda.lim)

out[[time.period]]<-data.table(lobbyist_id=row.names(fit.cv.ridge.coef),FE=as.numeric(fit.cv.ridge.coef),period=time.period)
out.lim[[time.period]]<-data.table(lobbyist_id=row.names(fit.cv.ridge.coef.lim),FE=as.numeric(fit.cv.ridge.coef.lim),period=time.period)
}

out <- rbindlist(out)
out.lim <- rbindlist(out.lim)

###Adjusted report earnings and yearly earnings 
FE<-out[substr(lobbyist_id,1,1)%in%c("Y","r")]
FE.lim<-out.lim[substr(lobbyist_id,1,1)%in%c("Y","r")]

year.constant<-out[substr(lobbyist_id,1,4)%in%c("year")]
year.constant[,year:=as.numeric(substr(lobbyist_id,5,8))]
year.constant[,lobbyist_id:=NULL]
setnames(year.constant,"FE","Constant")

#output estimated lobbyist FEs by ID


#First put back together covariates (had to delete due to memory limits)
f0<-read.dta("inputs/lobbyists.dta")
setDT(f0)
f0[,registrant_id:=paste("r",.GRP,sep="_"),keyby=registrant]
time.periods[,year:=as.integer(as.character(year))]

f0 <- time.periods[f0,on='year']

#Merge on FE
f1<-merge(f0,FE,by=c("lobbyist_id","period"),all.x=T)[!is.na(period)]
f1<-merge(f1,FE,by.x=c("registrant_id","period"),by.y=c("lobbyist_id","period"),all.x=T)

f1<-merge(f1,year.constant,by=c("year","period"),all.x=T)

f1.lim<-merge(f0,FE.lim,by=c("lobbyist_id","period"),all.x=T)
f1.lim<-merge(f1.lim,FE.lim,by.x=c("registrant_id","period"),by.y=c("lobbyist_id","period"),all.x=T)
f1.lim<-merge(f1.lim,year.constant,by=c("year","period"),all.x=T)

setnames(f1,c("FE.x","FE.y"),c("lobbyist.FE","Firm.FE"))
setnames(f1.lim,c("FE.x","FE.y"),c("lobbyist.FE","Firm.FE"))

f1[,N:=.N,by=trans_id]
f1.lim[,N:=.N,by=trans_id]

f1<-f1[!(lobbyist_id %in% drop.lobbyists.yr) & year %in% 2000:2014 & !is.na(lobbyist.FE)]
f1.lim<-f1.lim[!(lobbyist_id %in% drop.lobbyists.yr) & year %in% 2000:2014 & !is.na(lobbyist.FE)]

f1[,N.omit:=N-.N,by=trans_id]
f1.lim[,N.omit:=N-.N,by=trans_id]

adjustment<-data.table(year=2000:2014,adj=c(1.37,1.34,1.32,1.29,1.25,1.21,1.17,1.14,1.1,1.1,1.09,1.05,1.03,1.02,1))

adjust<-adjustment$adj[match(f1$year,adjustment$year)]
adjust.lim<-adjustment$adj[match(f1.lim$year,adjustment$year)]

f1[,amount_adjusted:=amount*adjust]
f1.lim[,amount_adjusted:=amount*adjust.lim]


#Create FE weighted measures
f1[is.na(Constant),Constant:=0]
f1.lim[is.na(Constant),Constant:=0]

f1[,lobbyist.FE.rescaled:=lobbyist.FE+Constant]

f1[,`:=`(amount.per.lobbyist=amount_adjusted/N,amount.weighted=amount_adjusted*(lobbyist.FE.rescaled/sum(as.numeric(N.omit>0)*(Firm.FE[1]+Constant[1]),sum(lobbyist.FE.rescaled,na.rm=T),na.rm=T))),by=trans_id]
f1.lim[,`:=`(amount.per.lobbyist=amount_adjusted/N,amount.weighted=amount_adjusted*(lobbyist.FE/sum(as.numeric(N.omit>0)*(Firm.FE[1]),sum(lobbyist.FE,na.rm=T),na.rm=T))),by=trans_id]
f1[is.na(amount.weighted),amount.weighted:=0]
f1.lim[is.na(amount.weighted),amount.weighted:=0]

f2<-f1[,.(lobbyist=lobbyist[1],amount=sum(amount_adjusted),amount.per.lobbyist=sum(amount.per.lobbyist),amount.weighted=sum(amount.weighted),lobbying.reports=.N),by=.(lobbyist_id,period)]
f2.lim<-f1.lim[,.(lobbyist=lobbyist[1],amount.lim=sum(amount_adjusted),amount.per.lobbyist.lim=sum(amount_adjusted)/sum(N),amount.weighted.lim=sum(amount.weighted),lobbying.reports.lim=.N),by=.(lobbyist_id,period)]

lobbyist_fe<-out[substr(lobbyist_id,1,1)=="Y"]

lobbyist.final0<-merge(out[substr(lobbyist_id,1,1)=="Y"],out.lim[substr(lobbyist_id,1,1)=="Y"],by=c("lobbyist_id","period"))
setnames(lobbyist.final0,c("FE.x","FE.y"),c("FE","FE.lim"))

lobbyist.final<-merge(lobbyist.final0,f2,by=c("lobbyist_id","period"),all.x=T)
lobbyist.final<-merge(lobbyist.final,f2.lim,by=c("lobbyist_id","period"),all.x=T)

lobbyist.final[,`:=`(lobbying.reports.lim=NULL,lobbyist.x=NULL)]
setnames(lobbyist.final,"lobbyist.y","lobbyist")


#Export results

#Must choose specific file to output based on previous selections

#split into hi and lo uncertainty
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,2,1,1,1,1,1,1,1,1,1,1,2))
#write.csv(lobbyist.final,file="usr_gen/lobbyist_final_hilo.csv",row.names=F)

#all years pooled
write.csv(lobbyist.final,file="usr_gen/lobbyist_final_all_covs.csv",row.names=F)
#write.csv(lobbyist.final,file="usr_gen/lobbyist_final_all.csv",row.names=F)

#yearly
#write.csv(lobbyist.final,file="lobbyist_final_yearly.csv",row.names=F)

#Years classified as either Top vs. bottom half of Uncertainty
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,2,3,3,3,3,4,4,4,4,4,4,5))
#write.csv(lobbyist.final,file="usr_gen/lobbyist_final_50.csv",row.names=F)

#Years classified as either Top 25 vs. bottom 75 of Uncertainty
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,3,3,3,3,3,3,3,4,4,4,5,5))
#write.csv(lobbyist.final,file="usr_gen/lobbyist_final_75.csv",row.names=F)

#Years classified by alternative rubric
#time.periods <- data.table(year=2000:2014,period=c(1,2,2,2,3,3,3,3,4,4,4,4,4,5,5))
#write.csv(lobbyist.final,file="usr_gen/lobbyist_final_a.csv",row.names=F)

